In [1]:
import prody as pr
import py3Dmol
import tempfile
# --- notebook parameters (non-interactive) ---
import os, sys, asyncio
from pathlib import Path
import pandas as pd
from Analysis.PlotUtils import *
from scipy.ndimage import binary_dilation
import matplotlib.patches as mpatches
from IPython.display import display, HTML
from Bio import PDB
from Bio import pairwise2
from Bio.PDB import Superimposer
import py3Dmol
import ipywidgets as widgets
from Bio.PDB import PDBParser
from Bio import pairwise2
import py3Dmol
from Bio import PDB, Align
from Bio.PDB import PDBParser, PDBIO
# from Bio.PDB.Polypeptide import protein_letters_3to1
from io import StringIO
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, BoundaryNorm
from matplotlib.patches import Patch
from config import *
from utils.utils import *
from utils.protein_utils import extract_protein_sequence
from utils.protein_plot_utils import *
CONFIG FOLD-SWITCH PIEPLINE WITH USER: orzuk ENVIRONMENT: Linux
PyRosetta not installed; skipping energy/ΔG panels
In [2]:
# Silence Windows asyncio warning from zmq/tornado
if sys.platform.startswith("win"):
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
# Get pair from environment (set by generate_notebooks.py)
fold_pair = os.getenv("PAIR_ID", "").strip()
if not fold_pair:
# We are likely running interactively in Jupyter UI (not nbconvert).
# Fall back to input() ONLY when stdin is available.
try:
fold_pair = input("Enter fold-switch pair (e.g., 3hdfA_3hdeA): ").strip()
except Exception:
raise RuntimeError(
"PAIR_ID not provided and no stdin available. "
"When running via generate_notebooks.py, PAIR_ID is set automatically."
)
# Normalize and split into the two PDB+chain ids
if "_" not in fold_pair:
raise ValueError(f"PAIR_ID must look like 'A_B', got: {fold_pair}")
pdb1, pdb2 = fold_pair.split("_", 1)
print("Using pair:", pdb1, pdb2)
Using pair: 4tsyD 3zwgN
In [3]:
plot_tool = PlotTool(folder=DATA_DIR, fold_pair=fold_pair)
@> 5982 atoms and 1 coordinate set(s) were parsed in 0.12s.
@> 20835 atoms and 1 coordinate set(s) were parsed in 0.23s.
In [4]:
VIZ_FOLDER = f'{DATA_DIR}/{fold_pair}/output_cmap_esm/VizCmaps'
print("Fold-pair,", fold_pair, flush=True)
chains = (pdb1[-1].upper(), pdb2[-1].upper())
print("chains,", chains, flush=True)
pdb_path1 = f"{DATA_DIR}/{fold_pair}/{pdb1[:-1]}.pdb"
pdb_path2 = f"{DATA_DIR}/{fold_pair}/{pdb2[:-1]}.pdb"
#fold_pair_subdir = f"{fold_pair[0]}_{fold_pair[1]}"
Fold-pair, 4tsyD_3zwgN
chains, ('D', 'N')
In [5]:
seq_fold1 = extract_protein_sequence(pdb_path1, chain=chains[0], ca_only=False)
seq_fold2 = extract_protein_sequence(pdb_path2, chain=chains[1], ca_only=False)
Visualizations and results¶
Original Structure Visualization¶
In [6]:
pdb_file1 = read_pdb_file(pdb_path1)
plot_tool.plot_single_fold(pdb_file1,label=plot_tool.fold1)
pdb_file2 = read_pdb_file(pdb_path2)
plot_tool.plot_single_fold(pdb_file2,label=plot_tool.fold2)
3Dmol.js failed to load for some reason. Please check your browser console for error messages.
3Dmol.js failed to load for some reason. Please check your browser console for error messages.
In [7]:
try:
plot_tool.plot_fold_alignement_(pdb_path1,pdb_path1)
except:
print('Structural Alignment Plot fail !')
@> 5982 atoms and 1 coordinate set(s) were parsed in 0.05s.
@> 5982 atoms and 1 coordinate set(s) were parsed in 0.05s.
4tsy_TEMP
Blue:4tsy Red:4tsy
3Dmol.js failed to load for some reason. Please check your browser console for error messages.
AlphaFold predictions¶
Best AF prediction to fold 1¶
In [8]:
from pathlib import Path
df_af = load_csv_or_none(f"{plot_tool.folder}/{fold_pair}/Analysis/df_af.csv")
if df_af is not None:
# keep your existing filters
df_af = df_af[df_af.cluster_num != 'Query'].iloc[:, 1:-1]
print(f"[run] AlphaFold df ready: {len(df_af)} rows")
else:
print("[skip] AlphaFold section (df_af.csv not available)")
[ok] loaded: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/4tsyD_3zwgN/Analysis/df_af.csv rows=44
[run] AlphaFold df ready: 44 rows
In [9]:
MAX_AF_1, max_af_pdb1 = '', ''
if df_af is not None and not df_af.empty:
try:
max_af_pdb1 = df_af[df_af.score_pdb1 > df_af.score_pdb2].sort_values(by='score_pdb1',ascending=False).pdb_file.iloc[0]
max_af_pdb1 = f'{plot_tool.folder}/{fold_pair}/AF_preds/{max_af_pdb1}'
try:
MAX_AF_1 = max_af_pdb1.split('/')[-1][11:14]
except Exception as e:
print(e)
score = df_af[df_af.score_pdb1 > df_af.score_pdb2].sort_values(by='score_pdb1',ascending=False).score_pdb1.iloc[0]
plot_tool.align_and_visualize_pdb(max_af_pdb1,pdb_path1,score)
except Exception as e2:
print(f"[skip] AF fold1 alignment: {e2}")
try:
plot_tool.plot_fold_alignement_(max_af_pdb1,pdb_path1)
except Exception as e3:
print(f"[skip] AF fold1 fallback visualization: {e3}")
else:
print("[skip] AF fold1: df_af not available")
[skip] AF fold1 alignment: 'DataFrame' object has no attribute 'score_pdb1' [skip] AF fold1 fallback visualization: is not a valid filename or a valid PDB identifier.
Best AF prediction to fold 2¶
In [10]:
MAX_AF_2, max_af_pdb2 = '', ''
if df_af is not None and not df_af.empty:
try:
max_af_pdb2 = df_af[df_af.score_pdb1 < df_af.score_pdb2].sort_values(by='score_pdb2',ascending=False).pdb_file.iloc[0]
max_af_pdb2 = f'{plot_tool.folder}/{fold_pair}/AF_preds/{max_af_pdb2}'
try:
MAX_AF_2 = max_af_pdb2.split('/')[-1][11:14]
except Exception as e:
print(e)
score = df_af[df_af.score_pdb1 < df_af.score_pdb2].sort_values(by='score_pdb2',ascending=False).score_pdb2.iloc[0]
plot_tool.align_and_visualize_pdb(max_af_pdb2,pdb_path2,score)
except Exception as e2:
print(f"[skip] AF fold1 alignment: {e2}")
try:
plot_tool.plot_fold_alignement_(max_af_pdb2,pdb_path2)
except Exception as e3:
print(f"[skip] AF fold1 fallback visualization: {e3}")
else:
print("[skip] AF fold1: df_af not available")
[skip] AF fold1 alignment: 'DataFrame' object has no attribute 'score_pdb1' [skip] AF fold1 fallback visualization: is not a valid filename or a valid PDB identifier.
EsmFold Prediction¶
In [11]:
from pathlib import Path
import os, pandas as pd
# Inputs from the generator (below patch passes these env vars)
ROOT = Path(os.environ.get("MSACLUSTER_ROOT", ".")).resolve()
PAIR_ID = os.environ.get("PAIR_ID", None) or fold_pair # keep using fold_pair if already defined
PAIR_ANALYSIS = ROOT / "Pipeline" / PAIR_ID / "Analysis"
GLOBAL_RES = ROOT / "Pipeline_res"
def _read_csv_or_none(p: Path):
return pd.read_csv(p) if p.exists() and p.stat().st_size > 1 else None
# Prefer per-pair CSV; only fall back to global CSV if really needed
df_esmfold_analysis = _read_csv_or_none(PAIR_ANALYSIS / "df_esm.csv")
if df_esmfold_analysis is None:
df_esmfold_analysis = _read_csv_or_none(GLOBAL_RES / "df_esmfold_analysis.csv")
df_esmfold = None
if df_esmfold_analysis is not None:
# Your original filtering logic
df_esmfold = df_esmfold_analysis[df_esmfold_analysis.get("fold_pair", PAIR_ID) == PAIR_ID]
if "fold" in df_esmfold.columns:
df_esmfold = df_esmfold[df_esmfold["fold"].astype(str).str.contains("ShallowMsa")]
print(f"[run] ESMFold df ready: {len(df_esmfold)} rows")
display(df_esmfold.head())
else:
print("[skip] ESMFold section (no per-pair df_esm.csv and no global df_esmfold_analysis.csv)")
[run] ESMFold df ready: 200 rows
| fold_pair | model | cluster_num | name | TMscore_fold1 | TMscore_fold2 | TMdiff | |
|---|---|---|---|---|---|---|---|
| 0 | 4tsyD_3zwgN | esm2 | ShallowMsa_000 | ShallowMsa_000__sample_001 | 0.215869 | 0.067495 | 0.148374 |
| 1 | 4tsyD_3zwgN | esm2 | ShallowMsa_000 | ShallowMsa_000__sample_002 | 0.278811 | 0.091699 | 0.187112 |
| 2 | 4tsyD_3zwgN | esm2 | ShallowMsa_000 | ShallowMsa_000__sample_003 | 0.248704 | 0.083808 | 0.164896 |
| 3 | 4tsyD_3zwgN | esm2 | ShallowMsa_000 | ShallowMsa_000__sample_004 | 0.281325 | 0.089852 | 0.191473 |
| 4 | 4tsyD_3zwgN | esm2 | ShallowMsa_000 | ShallowMsa_000__sample_005 | 0.213490 | 0.083599 | 0.129891 |
Best EsmFold prediction to fold 1¶
In [12]:
if df_esmfold is not None and not df_esmfold.empty:
try:
# keep your original selection logic here (fixed snippets below are placeholders)
max_esm_pdb1 = df_esmfold.sort_values(by='TMscore_fold1', ascending=False).fold.iloc[0]
score = df_esmfold.sort_values(by='TMscore_fold1', ascending=False).TMscore_fold1.iloc[0]
plot_tool.align_and_visualize_pdb(pdb_path1, f'{plot_tool.folder}/{fold_pair}/output_esm_fold/{max_esm_pdb1}', score)
except Exception as e:
print(f"[skip] ESM fold1 align/plot: {e}")
try:
plot_tool.plot_fold_alignement_(pdb_path1, f'{plot_tool.folder}/{fold_pair}/output_esm_fold/{max_esm_pdb1}')
except Exception as e2:
print(f"[skip] ESM fold1 fallback visualization: {e2}")
else:
print("[skip] ESM fold1: df_esmfold not available")
[skip] ESM fold1 align/plot: 'DataFrame' object has no attribute 'fold' [skip] ESM fold1 fallback visualization: name 'max_esm_pdb1' is not defined
Best EsmFold prediction to fold 2¶
In [13]:
if df_esmfold is not None and not df_esmfold.empty:
try:
max_esm_pdb2 = df_esmfold.sort_values(by='TMscore_fold2', ascending=False).fold.iloc[0]
score = df_esmfold.sort_values(by='TMscore_fold2', ascending=False).TMscore_fold2.iloc[0]
plot_tool.align_and_visualize_pdb(pdb_path2, f'{plot_tool.folder}/{fold_pair}/output_esm_fold/{max_esm_pdb2}', score)
print(score)
except Exception as e:
print(f"[skip] ESM fold2 align/plot: {e}")
try:
print(score)
plot_tool.plot_fold_alignement_(pdb_path2, f'{plot_tool.folder}/{fold_pair}/output_esm_fold/{max_esm_pdb2}')
except Exception as e2:
print(f"[skip] ESM fold2 fallback visualization: {e2}")
else:
print("[skip] ESM fold2: df_esmfold not available")
[skip] ESM fold2 align/plot: 'DataFrame' object has no attribute 'fold' [skip] ESM fold2 fallback visualization: name 'score' is not defined
In [14]:
if df_esmfold is not None:
df_esmfold
else:
print("[skip] ESM df display (not available)")
Contact maps Msa Transformers¶
In [15]:
# Robust load of CMap CSV referenced by CMAP_RES_PATH (if defined)
df_cmap = None
try:
CMAP_RES_PATH # just to see if it exists
df_cmap = load_csv_or_none(CMAP_RES_PATH)
except NameError:
print("[skip] CMAP_RES_PATH is not defined in this notebook")
MAX_RECALL_1 = MAX_RECALL_2 = None
MAX_CLUSTER_RECALL_1 = MAX_CLUSTER_RECALL_2 = ''
if df_cmap is not None and not df_cmap.empty:
df_cmap = df_cmap[df_cmap.fold_pair == fold_pair]
try:
MAX_RECALL_1 = df_cmap[df_cmap.recall_only_fold1 > df_cmap.recall_only_fold2] \
.sort_values(by='recall_only_fold1', ascending=False).iloc[0]
MAX_CLUSTER_RECALL_1 = MAX_RECALL_1.File[-7:-4]
except Exception as e:
print(f"[skip] MAX_RECALL_1: {e}")
try:
MAX_RECALL_2 = df_cmap[df_cmap.recall_only_fold1 < df_cmap.recall_only_fold2] \
.sort_values(by='recall_only_fold2', ascending=False).iloc[0]
MAX_CLUSTER_RECALL_2 = MAX_RECALL_2.File[-7:-4]
except Exception as e:
print(f"[skip] MAX_RECALL_2: {e}")
else:
print("[skip] df_cmap not available")
[skip] missing CSV: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/data/df_cmap_all.csv
[skip] df_cmap not available
In [16]:
viz_folder = VIZ_FOLDER
if MAX_CLUSTER_RECALL_1 != '':
file_max_recall_1 = f'{viz_folder}/msa_t__ShallowMsa_{MAX_CLUSTER_RECALL_1}_visualization_map_1_tol_0.npy'
else:
file_max_recall_1 = ''
if MAX_CLUSTER_RECALL_2 != '':
file_max_recall_2 = f'{viz_folder}/msa_t__ShallowMsa_{MAX_CLUSTER_RECALL_2}_visualization_map_2_tol_0.npy'
else:
file_max_recall_2 = ''
In [ ]:
In [17]:
# build AF npy path explicitly and debug-print
af1_npy = (f"{VIZ_FOLDER}/msa_t__ShallowMsa_{MAX_AF_1}_visualization_map_1_tol_0.npy"
if MAX_AF_1 else "")
print("VIZ_FOLDER:", VIZ_FOLDER, flush=True)
print("file_max_recall_1:", file_max_recall_1, "exists?", os.path.exists(file_max_recall_1), flush=True)
print("af1_npy:", af1_npy, "exists?", os.path.exists(af1_npy) if af1_npy else None, flush=True)
print("fold_pair:", fold_pair)
print("viz_folder:", viz_folder)
print("MAX_CLUSTER_RECALL_1:", MAX_CLUSTER_RECALL_1, "MAX_CLUSTER_RECALL_2:", MAX_CLUSTER_RECALL_2)
if df_cmap is not None and not df_cmap.empty:
if file_max_recall_1 and af1_npy and os.path.exists(file_max_recall_1) and os.path.exists(af1_npy):
af_cmap_recall = df_cmap[df_cmap.File.str.contains(MAX_AF_1)].recall_only_fold1.iloc[0]
plot_viz_cmap2(file_max_recall_1, af1_npy,
legend_plot=f'Best Recall Fold 1 ({getattr(MAX_RECALL_1, "recall_only_fold1", "NA")}) '
f'Vs Best AF 1 ({af_cmap_recall})')
elif file_max_recall_1 and os.path.exists(file_max_recall_1):
plot_viz_cmap(file_max_recall_1,
legend_plot=f'Best Recall Fold 1 ({getattr(MAX_RECALL_1, "recall_only_fold1", "NA")})')
elif af1_npy and os.path.exists(af1_npy):
af_cmap_recall = df_cmap[df_cmap.File.str.contains(MAX_AF_1)].recall_only_fold1.iloc[0]
plot_viz_cmap(af1_npy, legend_plot=f'Best AF 1 ({af_cmap_recall})')
else:
print("[skip] fold1 viz: missing both maps", flush=True)
else:
print("[skip] cmap fold1 viz: df_cmap not available", flush=True)
VIZ_FOLDER: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/4tsyD_3zwgN/output_cmap_esm/VizCmaps
file_max_recall_1: exists? False
af1_npy: exists? None
fold_pair: 4tsyD_3zwgN viz_folder: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/4tsyD_3zwgN/output_cmap_esm/VizCmaps MAX_CLUSTER_RECALL_1: MAX_CLUSTER_RECALL_2: [skip] cmap fold1 viz: df_cmap not available
In [18]:
af2_npy = (f"{VIZ_FOLDER}/msa_t__ShallowMsa_{MAX_AF_2}_visualization_map_2_tol_0.npy"
if MAX_AF_2 else "")
print("VIZ_FOLDER:", VIZ_FOLDER, flush=True)
print("file_max_recall_2:", file_max_recall_2, "exists?", os.path.exists(file_max_recall_2), flush=True)
print("af2_npy:", af2_npy, "exists?", os.path.exists(af2_npy) if af2_npy else None, flush=True)
if df_cmap is not None and not df_cmap.empty:
if file_max_recall_2 and af2_npy and os.path.exists(file_max_recall_2) and os.path.exists(af2_npy):
af_cmap_recall = df_cmap[df_cmap.File.str.contains(MAX_AF_2)].recall_only_fold2.iloc[0]
plot_viz_cmap2(file_max_recall_2, af2_npy,
legend_plot=f'Best Recall Fold 2 ({getattr(MAX_RECALL_2, "recall_only_fold2", "NA")}) '
f'Vs Best AF 2 ({af_cmap_recall})')
elif file_max_recall_2 and os.path.exists(file_max_recall_2):
plot_viz_cmap(file_max_recall_2,
legend_plot=f'Best Recall Fold 2 ({getattr(MAX_RECALL_2, "recall_only_fold2", "NA")})')
elif af2_npy and os.path.exists(af2_npy):
af_cmap_recall = df_cmap[df_cmap.File.str.contains(MAX_AF_2)].recall_only_fold2.iloc[0]
plot_viz_cmap(af2_npy, legend_plot=f'Best AF 2 ({af_cmap_recall})')
else:
print("[skip] fold2 viz: missing both maps", flush=True)
else:
print("[skip] cmap fold2 viz: df_cmap not available", flush=True)
VIZ_FOLDER: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/4tsyD_3zwgN/output_cmap_esm/VizCmaps
file_max_recall_2: exists? False
af2_npy: exists? None
[skip] cmap fold2 viz: df_cmap not available
In [ ]: